Downloading data:

Downloading rasters

First we create a folder in our working directory to include input data.

# Create folder to store inputs.
if(!dir_exists('input_data')){
  dir_create('input_data')
}

Now let’s download data from WorldClim 2.1. Note that when R downloads a file it has a timeout of 60 seconds. This may not be enough to download environmental data, so we can set options(timeout=n), where n is the number of seconds we need to download the data.

options(timeout=600)

# Current data
# Files are automatically saved in input_data folder.
WorldClim_data('current', variable = 'bioc', resolution = 5)
## [1] "The file for current scenario is already downloaded."
# Future data
WorldClim_data('future', variable = 'bioc', year = '2090', gcm = 'mi', ssp = c('245','585'), resolution = 5) 
## [1] "The file for future scenario (input_data/WorldClim_data_future/mi_ssp245_5m_2090.tif) is already downloaded."
## [1] "The file for future scenario (input_data/WorldClim_data_future/mi_ssp585_5m_2090.tif) is already downloaded."

Obtaining occurrence data from GBIF:

# Downloading data from GBIF
# File is automatically saved in input_data folder
# spp_data <- GBIF_data('Colossoma macropomum')

We will use our own data:

spp_data <- here('input_data/spp_own_data.csv')

Downloading shapefile for study area

# Obtaining Natural Earth data:
#shape_study_area <- rnaturalearth::ne_download(scale = 50, type = "rivers_lake_centerlines", category = "physical")

Geoprocessing:

Open Files and Data

Firstly, we name inputs and outputs, caring for using the correct extensions. a) Inputs:

# Shapefile (polygon or lines) delimiting study area.
shape_study_area <- here("input_data/shape_study_area/AmazonHydroRivers4.shp")  

# Directory name containing current rasters to be rescaled.
folder_current_rasters <- here("input_data/WorldClim_data_current/")

# Directory name containing future rasters to be rescaled.
folder_future_rasters <- here("input_data/WorldClim_data_future/")
  1. Outputs:
# Name of shapefile (.shp) for the study area to be saved.
output_shp_study_area <- here("output_data/grid/Colossoma_grid.shp")

# Name of R object (.rds) where the current rescaled variables will be saved.
output_shp_current <- here("output_data/WorldClim_data_current_rescaled/Colossoma_current.rds")

# Name of R objects (.rds) where the future rescaled variables will be saved.
scenarios <- c('ssp245', 'ssp585') # Scenarios names
output_shp_future <- here(paste0("output_data/WorldClim_data_future_rescaled/",scenarios,".rds"))
  1. Seting up some important variables:
# Cell hight and width for the grid.
# This value depends on rasters projection. If rasters are in UTM, values will be in meters. If rasters are in decimal degrees (as in WorldClim 2.1), values will be in degrees.
# Following values build a grid with 0.1 x 0.1 degree.
cell_width = 0.1 
cell_height = 0.1
# Note that setting those values to build a grid smaller than the input rasters may generate NaNs, causing some problems in the future.

# If you have any variable in shape_study_area that you want to keep for rescaling, you can set here.
# Set the correct names of variables using as much as 10 characters.
# Setting the names to list() will use none of the variables in the shape, while setting it to NULL will use all variables.
names_var_shp_study_area <-  list()

# As in the codeline above, here we set which variables in current rasters we want to keep for rescaling.
# Set the correct names of variables using as much as 10 characters.
# Setting the names to list() will use none of the variables in the shape, while setting it to NULL will use all variables.
current_var_names <- paste0('bio_', 1:19) # or NULL

# As in the codelines above, here we set which variables in future rasters we want to keep for rescaling.
# We will usually need at least the same variables as in the current scenario for projection.
# Set the correct names of variables using as much as 10 characters.
# Setting the names to list() will use none of the variables in the shape, while setting it to NULL will use all variables.
future_var_names <-  current_var_names 

Study Area

The map of study area needs to be imported to R, so we can create a grid for the study area. This grid will be used for model building and projections.

shape_study_area %<>%
  st_read() %>%
  repair_shp()
## Reading layer `AmazonHydroRivers4' from data source 
##   `/Users/luizesser/Documents/Post-Doc/NAPI-EC/Colossoma copy/input_data/shape_study_area/AmazonHydroRivers4.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 107294 features and 14 fields
## Geometry type: LINESTRING
## Dimension:     XY
## Bounding box:  xmin: -79.37708 ymin: -20.21042 xmax: -48.44375 ymax: 5.139583
## Geodetic CRS:  WGS 84
if (output_shp_study_area %>% file_exists()== F){
  grid_study_area <- shape_study_area %>% 
      make_grid(cell_width, cell_height, names_var_shp_study_area)
  
  output_shp_study_area %>% 
    path_dir() %>% 
    dir_create()
  
  grid_study_area %>% st_write(
      dsn = output_shp_study_area)
} else {
  grid_study_area <- output_shp_study_area %>% st_read()
}
## Writing layer `Colossoma_grid' to data source 
##   `/Users/luizesser/Documents/Post-Doc/NAPI-EC/Colossoma copy/output_data/grid/Colossoma_grid.shp' using driver `ESRI Shapefile'
## Writing 23811 features with 3 fields and geometry type Polygon.

Rescaling variables

The next step aims to cross data from study area with rasters of variables. We will start with current data.

### Rescaling current data
if (output_shp_current %>% file_exists() == F) {
  grid_current <- grid_study_area %>% 
    add_raster(folder_current_rasters, current_var_names) 
    
  output_shp_current %>% 
    path_dir() %>% 
    dir_create()
  
  grid_current %>% saveRDS(output_shp_current)
} else {
  grid_current <- output_shp_current %>% readRDS()
}

Now within a loop to rescale future variables.

### Rescaling future data
if (!all(output_shp_future %>% file_exists())) {
  for (i in 1:length(scenarios)) {
    grid_future <- grid_study_area %>% 
      add_raster(folder_future_rasters, future_var_names, scenarios[i]) 
      
    output_shp_future[i] %>% 
      path_dir() %>% 
      dir_create()
    
    grid_future %>% saveRDS(output_shp_future[i])
  }
}

grid_future <- lapply(output_shp_future,function(x){readRDS(x)})
names(grid_future) <- scenarios

print(grid_future)
## $ssp245
## Simple feature collection with 23811 features and 22 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -79.3 ymin: -20.2 xmax: -48.4 ymax: 5.2
## Geodetic CRS:  WGS 84
## First 10 features:
##    cell_id x_centroid y_centroid bio_1 bio_2 bio_3  bio_4 bio_5 bio_6 bio_7
## 1      154     -63.95     -20.15  24.0  12.2 59.90 307.00 33.30 12.90 20.40
## 2      155     -63.85     -20.15  24.4  12.4 59.60 315.30 33.90 13.10 20.80
## 3      156     -63.75     -20.15  24.5  12.5 59.50 320.35 34.05 13.05 21.00
## 4      157     -63.65     -20.15  24.5  12.5 59.30 322.50 34.10 13.00 21.10
## 5      159     -63.45     -20.15  25.4  13.2 59.80 337.50 35.40 13.30 22.10
## 6      467     -63.65     -20.05  24.9  12.6 59.50 325.10 34.50 13.40 21.10
## 7      468     -63.55     -20.05  24.8  12.8 60.10 323.90 34.40 13.10 21.30
## 8      469     -63.45     -20.05  25.3  13.1 60.00 330.60 35.20 13.40 21.80
## 9      470     -63.35     -20.05  25.5  13.3 60.40 333.90 35.50 13.40 22.10
## 10     471     -63.25     -20.05  25.6  13.1 60.35 331.90 35.40 13.65 21.75
##    bio_8 bio_9 bio_10 bio_11 bio_12 bio_13 bio_14 bio_15 bio_16 bio_17 bio_18
## 1   27.2 21.60  27.00  19.80  726.2  120.5    8.3   74.1 356.60   34.4  310.6
## 2   27.6 21.80  27.50  20.10  710.4  119.5    7.0   75.4 351.50   30.3  304.5
## 3   27.7 21.95  27.55  20.10  687.9  116.9    5.9   76.4 342.35   26.7  295.2
## 4   27.8 21.90  27.60  20.10  663.1  114.8    5.0   77.8 333.90   23.8  286.1
## 5   28.7 22.70  28.70  20.80  619.6  108.5    4.0   79.5 315.00   19.0  266.5
## 6   28.1 22.30  28.00  20.40  663.5  114.0    5.0   77.4 333.80   24.3  285.8
## 7   28.0 22.10  27.90  20.30  652.9  113.5    4.8   77.9 329.50   23.3  282.0
## 8   28.6 22.70  28.50  20.70  622.4  108.5    4.3   79.2 316.80   20.1  268.8
## 9   28.9 20.30  28.80  21.00  599.6  106.0    4.0   80.1 308.50   18.8  259.8
## 10  29.0 21.70  28.80  21.05  576.4  102.9    3.4   80.8 299.20   16.8  250.2
##    bio_19                       geometry
## 1   54.30 POLYGON ((-64 -20.2, -63.9 ...
## 2   51.40 POLYGON ((-63.9 -20.2, -63....
## 3   48.05 POLYGON ((-63.8 -20.2, -63....
## 4   44.10 POLYGON ((-63.7 -20.2, -63....
## 5   39.30 POLYGON ((-63.5 -20.2, -63....
## 6   45.80 POLYGON ((-63.7 -20.1, -63....
## 7   42.80 POLYGON ((-63.6 -20.1, -63....
## 8   39.50 POLYGON ((-63.5 -20.1, -63....
## 9   37.30 POLYGON ((-63.4 -20.1, -63....
## 10  35.55 POLYGON ((-63.3 -20.1, -63....
## 
## $ssp585
## Simple feature collection with 23811 features and 22 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -79.3 ymin: -20.2 xmax: -48.4 ymax: 5.2
## Geodetic CRS:  WGS 84
## First 10 features:
##    cell_id x_centroid y_centroid bio_1 bio_2 bio_3  bio_4 bio_5 bio_6 bio_7
## 1      154     -63.95     -20.15 25.70 12.30 59.00 312.30 35.40 14.50 20.90
## 2      155     -63.85     -20.15 26.10 12.50 58.80 323.40 35.90 14.70 21.20
## 3      156     -63.75     -20.15 26.15 12.55 58.55 327.55 36.10 14.65 21.45
## 4      157     -63.65     -20.15 26.10 12.50 58.00 329.00 36.20 14.60 21.60
## 5      159     -63.45     -20.15 27.00 13.20 58.10 346.90 37.60 14.90 22.70
## 6      467     -63.65     -20.05 26.50 12.60 58.20 332.70 36.60 15.00 21.60
## 7      468     -63.55     -20.05 26.40 12.80 58.30 332.70 36.60 14.70 21.90
## 8      469     -63.45     -20.05 26.90 13.10 58.00 342.90 37.40 14.90 22.50
## 9      470     -63.35     -20.05 27.10 13.30 58.30 344.80 37.80 15.00 22.80
## 10     471     -63.25     -20.05 27.15 13.05 58.00 344.45 37.65 15.15 22.50
##    bio_8 bio_9 bio_10 bio_11 bio_12 bio_13 bio_14 bio_15 bio_16 bio_17 bio_18
## 1  28.70 23.60  28.70   21.3 764.20 127.30    7.3  74.80 371.90  32.40  329.9
## 2  29.10 23.90  29.20   21.5 747.90 125.00    7.0  75.60 365.00  29.50  246.8
## 3  29.15 23.90  29.25   21.5 724.20 121.15    5.9  76.30 354.80  26.70  239.9
## 4  29.10 23.90  29.20   21.5 696.30 116.80    5.0  77.40 343.60  23.80  231.3
## 5  30.10 24.70  30.30   22.1 649.80 110.30    3.0  79.00 323.30  18.00  212.5
## 6  29.50 24.30  29.60   21.8 697.10 117.80    5.0  76.80 343.10  24.30  228.8
## 7  29.40 24.10  29.50   21.7 684.50 115.30    4.8  77.10 337.80  23.60  226.8
## 8  30.00 24.60  30.10   22.0 652.30 110.80    3.8  78.30 324.10  19.60  213.8
## 9  30.30 24.90  30.40   22.3 627.40 107.30    3.0  79.10 314.30  17.80  204.8
## 10 30.30 24.85  30.40   22.3 603.35 103.40    3.0  79.25 303.85  16.65  197.7
##    bio_19                       geometry
## 1   55.30 POLYGON ((-64 -20.2, -63.9 ...
## 2   52.60 POLYGON ((-63.9 -20.2, -63....
## 3   50.05 POLYGON ((-63.8 -20.2, -63....
## 4   46.10 POLYGON ((-63.7 -20.2, -63....
## 5   42.00 POLYGON ((-63.5 -20.2, -63....
## 6   48.80 POLYGON ((-63.7 -20.1, -63....
## 7   45.30 POLYGON ((-63.6 -20.1, -63....
## 8   42.50 POLYGON ((-63.5 -20.1, -63....
## 9   40.30 POLYGON ((-63.4 -20.1, -63....
## 10  38.55 POLYGON ((-63.3 -20.1, -63....

Occurrence data

Open files with data

It is necessary to name the output. Be extra careful with extension names. a) Output:

#  Set the path to the output shapefile, which will contain the presence/absence matrix.
spp_output <- here("output_data/Colossoma_pa.shp")

It is also necessary to set some other important parameters.

# Species names to be used in the study. 
# Names should be identical from input/spp_data.csv obtained previously.
# Setting this to NULL will use all species.
spp_names <- 'Colossoma.macropomum' # or NULL

# Set which CRS projection is the grid_study_area.
crs_occ_data <- "+init=epsg:4326"

Importing occurrence data

occ_shp <- spp_data %>% 
  occurrences_to_shapefile(spp_names, grid_study_area, CRS(crs_occ_data))

plot(occ_shp)

Occurrence grid for the Study Area

We will say to the grid_study_area which cells have an occurrence record for the studied species.

if (spp_output %>% file_exists() == F) {
  grid_matrix_pa <- occ_shp %>% 
    occurrences_to_pa_shapefile(grid_study_area, spp_names)
  
  spp_output %>% 
    path_dir() %>% 
    dir_create()
  
  grid_matrix_pa %<>% select(-c('x_centroid', 'y_centroid', 'cell_id'))
  
  grid_matrix_pa %>% st_write(dsn = spp_output)
} else {
  grid_matrix_pa <- spp_output %>% st_read()
}
## Writing layer `Colossoma_pa' to data source 
##   `/Users/luizesser/Documents/Post-Doc/NAPI-EC/Colossoma copy/output_data/Colossoma_pa.shp' using driver `ESRI Shapefile'
## Writing 23811 features with 1 fields and geometry type Polygon.
grid_matrix_pa %>% mutate(across(!geometry, as.factor)) %>%
  ggplot() +
      geom_sf(aes(fill = !!sym(colnames(grid_matrix_pa)[-length(colnames(grid_matrix_pa))])), color=NA) +
      scale_fill_brewer(palette = "Set2")   

Check how many records there is to each species:

sapply(colnames(grid_matrix_pa)[-ncol(grid_matrix_pa)], 
       function(x){
          sum(as.data.frame(grid_matrix_pa)[,x])},
       USE.NAMES = T)
## clssm_mcrp 
##        220

Variable Selection - VIF

Variable Selection must be done OR with a VIF routine, OR with a PCA. To perform a VIF routine, we will use only the environmental data from the occurrence data.

# Obtain occurrence data.frame and environmental variables:
df_vif <- get_df_vif(grid_current, grid_matrix_pa)
## var_names not informed. Variables detected:  bio_1, bio_2, bio_3, bio_4, bio_5, bio_6, bio_7, bio_8, bio_9, bio_10, bio_11, bio_12, bio_13, bio_14, bio_15, bio_16, bio_17, bio_18, bio_19
# Run VIF routine from usdm package:
vif_bio <- df_vif %>% 
             vifcor(th=0.5)
vif_bio
## 15 variables from the 19 input variables have collinearity problem: 
##  
## bio_17 bio_16 bio_7 bio_15 bio_6 bio_11 bio_12 bio_1 bio_2 bio_4 bio_9 bio_14 bio_19 bio_10 bio_5 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( bio_8 ~ bio_3 ):  -0.02141673 
## max correlation ( bio_13 ~ bio_8 ):  -0.3385438 
## 
## ---------- VIFs of the remained variables -------- 
##   Variables      VIF
## 1     bio_3 1.026042
## 2     bio_8 1.177751
## 3    bio_13 1.208131
## 4    bio_18 1.080521
# We can exclude variables with high VIF or go to the next step.
grid_current_vif <- select(grid_current, !vif_bio@excluded)

Variable Selection - PCA

  1. Outputs:
# Name of the shapefile in which the PCA values will be stored.
shp_pca <- here("output_data/pca/shp_pca.rds")

# Name of the shapefile in which current PCA projection will be stored.
shp_preds <- here("output_data/pca/shp_preds.rds")

# Name of the shapefile in which future PCA projection will be stored.
shp_preds_future <- here("output_data/pca/shp_preds_future.rds")
  1. Transforming objects to be used in the next steps:
shp_matrix_pa <-  spp_output %>%
  st_read()
## Reading layer `Colossoma_pa' from data source 
##   `/Users/luizesser/Documents/Post-Doc/NAPI-EC/Colossoma copy/output_data/Colossoma_pa.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 23811 features and 1 field
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -79.3 ymin: -20.2 xmax: -48.4 ymax: 5.2
## Geodetic CRS:  WGS 84
df_species <- shp_matrix_pa %>% 
  as.data.frame() %>%
  select(-c('geometry'))

df_var_preditors <- output_shp_current %>%
  get_predictors_as_df()

Control Variables

# Names of variables to be normalized (centered and scaled).
# Normalization can improve the modelling, the calculation of PCAs and the clusterization algorithms used to generate pseudoabsences.
var_normalization <- current_var_names # OR: paste0("bio_",1:19)

# Names of variables to be used in PCA.
var_pca_bio <- current_var_names # OR: paste0("bio_",1:19)

# Number of PCA-axes to be retained.
nr_comp_pca_bio <- 4

# Names of variables to be used as predictors when training the models. It can be environmental variables or PCA-axes.
var_predictors <-c("dim_1_bio","dim_2_bio","dim_3_bio","dim_4_bio")

Preparing Predictor Variables

Standardizing and normalizing

df_potential_predictors <- df_species %>%
  bind_cols(df_var_preditors)

df_potential_predictors <- df_potential_predictors %>% 
  center_scale(var_normalization)

df_potential_predictors %>% 
  head() %>% 
  round(4) %>% 
  datatable(options = list(pageLength = 10, scrollX=T))

Preparing Variables for PCA

df_var_pca <- df_potential_predictors %>% 
  select(var_pca_bio[which(var_pca_bio %in% colnames(df_potential_predictors))])

df_var_pca %>%
  head() %>% 
  round(4) %>% 
  datatable(options = list(pageLength = 10, scrollX=T))

PCA analysis for variables.

Correlation Matrix

df_var_pca %>% 
  corr_plot()
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation ideoms with `aes()`
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.

PCA analysis

The table that follows shows the values of the axes and variables. A tabela a seguir mostra os valores dos eixos e variaveis.

if (shp_pca %>% file_exists() == T) {
  file_delete(shp_pca)
}
pca_bio <- df_var_pca %>% 
  calc_pca(nr_comp_pca_bio, "bio")

pca_bio$var$loadings %>% 
  round(4) %>% 
  datatable(options = list(pageLength = 10, scrollX=T))
if(!dir_exists('output_data/pca/')){
  dir_create('output_data/pca/')
}

grid_study_area %>% 
  bind_cols(pca_bio$ind$coord %>% as.data.frame()) %>% 
  saveRDS(file = shp_pca)

Summary of dimensions. The following table shows the correlation between variables with axes (i.e. the significance from each variable to given axle).

pca_bio %>% 
  dt_pca_summ()

Variable Contribution

pca_bio %>% 
  contrib_scree()

pca_bio %>% 
  contrib_corr()

pca_bio %>% 
  contrib_dims()

Quality of Variables’ Representation

PCA-axes considering the quality of variables’ representation.

pca_bio %>% 
  pca_cos2()

Quality of variables’ representation on axes.

pca_bio %>% 
  cos2_dims()

pca_bio %>% 
  cos2_corr()

pca_bio %>% 
  pca_bi_plot(df_species %>% rowMeans() %>% ceiling())

pca_bio %>% 
  comp_pca_retain()
## $broken.stick
## [1] 2
## 
## $kaiser.mean
## [1] 4
## 
## $horn.p
## [1] 4

Generating Shapefile with Predictors

Jointing PCA to variables

df_potential_predictors <- df_potential_predictors %>%
  bind_cols(pca_bio$ind$coord %>% as.data.frame()) 

df_potential_predictors %>% 
  head() %>% 
  round(4) %>%
  datatable(options = list(pageLength = 10, scrollX=T))
if (shp_preds %>% file_exists() == F){
  df_var_preditors <- df_potential_predictors %>% 
    select(var_predictors %>% all_of())
  
  grid_study_area %>% 
    select(cell_id) %>% 
    bind_cols(df_var_preditors) %>% 
    saveRDS(file = shp_preds)
}

Projecting PCA-axes to the future scenarios.

pca_future <- proj_pca(pca_bio, grid_future)
pca_future
## $ssp245
## Simple feature collection with 23811 features and 26 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -79.3 ymin: -20.2 xmax: -48.4 ymax: 5.2
## Geodetic CRS:  WGS 84
## First 10 features:
##    cell_id x_centroid y_centroid bio_1 bio_2 bio_3  bio_4 bio_5 bio_6 bio_7
## 1      154     -63.95     -20.15  24.0  12.2 59.90 307.00 33.30 12.90 20.40
## 2      155     -63.85     -20.15  24.4  12.4 59.60 315.30 33.90 13.10 20.80
## 3      156     -63.75     -20.15  24.5  12.5 59.50 320.35 34.05 13.05 21.00
## 4      157     -63.65     -20.15  24.5  12.5 59.30 322.50 34.10 13.00 21.10
## 5      159     -63.45     -20.15  25.4  13.2 59.80 337.50 35.40 13.30 22.10
## 6      467     -63.65     -20.05  24.9  12.6 59.50 325.10 34.50 13.40 21.10
## 7      468     -63.55     -20.05  24.8  12.8 60.10 323.90 34.40 13.10 21.30
## 8      469     -63.45     -20.05  25.3  13.1 60.00 330.60 35.20 13.40 21.80
## 9      470     -63.35     -20.05  25.5  13.3 60.40 333.90 35.50 13.40 22.10
## 10     471     -63.25     -20.05  25.6  13.1 60.35 331.90 35.40 13.65 21.75
##    bio_8 bio_9 bio_10 bio_11 bio_12 bio_13 bio_14 bio_15 bio_16 bio_17 bio_18
## 1   27.2 21.60  27.00  19.80  726.2  120.5    8.3   74.1 356.60   34.4  310.6
## 2   27.6 21.80  27.50  20.10  710.4  119.5    7.0   75.4 351.50   30.3  304.5
## 3   27.7 21.95  27.55  20.10  687.9  116.9    5.9   76.4 342.35   26.7  295.2
## 4   27.8 21.90  27.60  20.10  663.1  114.8    5.0   77.8 333.90   23.8  286.1
## 5   28.7 22.70  28.70  20.80  619.6  108.5    4.0   79.5 315.00   19.0  266.5
## 6   28.1 22.30  28.00  20.40  663.5  114.0    5.0   77.4 333.80   24.3  285.8
## 7   28.0 22.10  27.90  20.30  652.9  113.5    4.8   77.9 329.50   23.3  282.0
## 8   28.6 22.70  28.50  20.70  622.4  108.5    4.3   79.2 316.80   20.1  268.8
## 9   28.9 20.30  28.80  21.00  599.6  106.0    4.0   80.1 308.50   18.8  259.8
## 10  29.0 21.70  28.80  21.05  576.4  102.9    3.4   80.8 299.20   16.8  250.2
##    bio_19 dim_1_bio dim_2_bio  dim_3_bio dim_4_bio
## 1   54.30 -6.999700  2.197775 -0.4198756  3.259141
## 2   51.40 -6.924676  2.531408 -0.3663595  3.417784
## 3   48.05 -6.985559  2.656438 -0.3786148  3.474299
## 4   44.10 -7.048562  2.741864 -0.4122429  3.489778
## 5   39.30 -6.873205  3.422379 -0.3772380  3.867292
## 6   45.80 -6.850728  2.928511 -0.4365421  3.599186
## 7   42.80 -6.981409  2.875291 -0.4242578  3.574249
## 8   39.50 -6.845315  3.297870 -0.4487805  3.781792
## 9   37.30 -7.027456  3.342513 -0.4020257  3.879015
## 10  35.55 -6.856431  3.433748 -0.5935038  3.853050
##                          geometry
## 1  POLYGON ((-64 -20.2, -63.9 ...
## 2  POLYGON ((-63.9 -20.2, -63....
## 3  POLYGON ((-63.8 -20.2, -63....
## 4  POLYGON ((-63.7 -20.2, -63....
## 5  POLYGON ((-63.5 -20.2, -63....
## 6  POLYGON ((-63.7 -20.1, -63....
## 7  POLYGON ((-63.6 -20.1, -63....
## 8  POLYGON ((-63.5 -20.1, -63....
## 9  POLYGON ((-63.4 -20.1, -63....
## 10 POLYGON ((-63.3 -20.1, -63....
## 
## $ssp585
## Simple feature collection with 23811 features and 26 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -79.3 ymin: -20.2 xmax: -48.4 ymax: 5.2
## Geodetic CRS:  WGS 84
## First 10 features:
##    cell_id x_centroid y_centroid bio_1 bio_2 bio_3  bio_4 bio_5 bio_6 bio_7
## 1      154     -63.95     -20.15 25.70 12.30 59.00 312.30 35.40 14.50 20.90
## 2      155     -63.85     -20.15 26.10 12.50 58.80 323.40 35.90 14.70 21.20
## 3      156     -63.75     -20.15 26.15 12.55 58.55 327.55 36.10 14.65 21.45
## 4      157     -63.65     -20.15 26.10 12.50 58.00 329.00 36.20 14.60 21.60
## 5      159     -63.45     -20.15 27.00 13.20 58.10 346.90 37.60 14.90 22.70
## 6      467     -63.65     -20.05 26.50 12.60 58.20 332.70 36.60 15.00 21.60
## 7      468     -63.55     -20.05 26.40 12.80 58.30 332.70 36.60 14.70 21.90
## 8      469     -63.45     -20.05 26.90 13.10 58.00 342.90 37.40 14.90 22.50
## 9      470     -63.35     -20.05 27.10 13.30 58.30 344.80 37.80 15.00 22.80
## 10     471     -63.25     -20.05 27.15 13.05 58.00 344.45 37.65 15.15 22.50
##    bio_8 bio_9 bio_10 bio_11 bio_12 bio_13 bio_14 bio_15 bio_16 bio_17 bio_18
## 1  28.70 23.60  28.70   21.3 764.20 127.30    7.3  74.80 371.90  32.40  329.9
## 2  29.10 23.90  29.20   21.5 747.90 125.00    7.0  75.60 365.00  29.50  246.8
## 3  29.15 23.90  29.25   21.5 724.20 121.15    5.9  76.30 354.80  26.70  239.9
## 4  29.10 23.90  29.20   21.5 696.30 116.80    5.0  77.40 343.60  23.80  231.3
## 5  30.10 24.70  30.30   22.1 649.80 110.30    3.0  79.00 323.30  18.00  212.5
## 6  29.50 24.30  29.60   21.8 697.10 117.80    5.0  76.80 343.10  24.30  228.8
## 7  29.40 24.10  29.50   21.7 684.50 115.30    4.8  77.10 337.80  23.60  226.8
## 8  30.00 24.60  30.10   22.0 652.30 110.80    3.8  78.30 324.10  19.60  213.8
## 9  30.30 24.90  30.40   22.3 627.40 107.30    3.0  79.10 314.30  17.80  204.8
## 10 30.30 24.85  30.40   22.3 603.35 103.40    3.0  79.25 303.85  16.65  197.7
##    bio_19 dim_1_bio dim_2_bio  dim_3_bio dim_4_bio
## 1   55.30 -6.632169  1.688637 -0.3910017  2.819546
## 2   52.60 -6.601372  2.056058 -0.5499548  2.771337
## 3   50.05 -6.671913  2.160743 -0.5652803  2.821762
## 4   46.10 -6.757878  2.235183 -0.6127202  2.827303
## 5   42.00 -6.614138  2.930801 -0.5519909  3.193383
## 6   48.80 -6.554173  2.421841 -0.6312663  2.932317
## 7   45.30 -6.703282  2.407938 -0.5964249  2.943381
## 8   42.50 -6.620471  2.822484 -0.5836760  3.146212
## 9   40.30 -6.568640  3.023775 -0.6249385  3.230148
## 10  38.55 -6.545094  3.012958 -0.7564486  3.222247
##                          geometry
## 1  POLYGON ((-64 -20.2, -63.9 ...
## 2  POLYGON ((-63.9 -20.2, -63....
## 3  POLYGON ((-63.8 -20.2, -63....
## 4  POLYGON ((-63.7 -20.2, -63....
## 5  POLYGON ((-63.5 -20.2, -63....
## 6  POLYGON ((-63.7 -20.1, -63....
## 7  POLYGON ((-63.6 -20.1, -63....
## 8  POLYGON ((-63.5 -20.1, -63....
## 9  POLYGON ((-63.4 -20.1, -63....
## 10 POLYGON ((-63.3 -20.1, -63....
pca_future %>% 
    saveRDS(file = shp_preds_future)

Training Models

Set Data

As in previous steps, it is necessary to set inputs and outputs, taking extra care with extension names. a) Input:

# Let's use the PCA-axes built in the last step:
df_var_preditors <- df_potential_predictors[,colnames(df_potential_predictors) %in% var_predictors]
grid_future <- pca_future
  1. Outputs:
# Name the directory to save trained models.
folder_models <- here("output_data/models")
  1. Control Variables:
# Algorithm names to be used in Species Distribution Modeling.
# Run getmethodNames() to unveil which algorithms are available.
algo <- c("mlp", "glmnet","brt")

# Set the threshold criteria.
# 1:sp=se, 2:max(se+sp), 3:min(cost), 4:minROCdist, 5:max(kappa), 6:max(ppv+npv), 7:ppv=npv, 8:max(NMI), 9:max(ccr), 10: prevalence
thresh_criteria <- 2

# Number of runs to each algorithm
n_run <- 10

# Number of folds to crossvalidation
n_folds <- 4

Generate Pseudoabsences

To build models, it is necessary to use pseudoabsences that contrast to presences. Currently, only the ‘random’ method is applied.

df_pseudoabsences <- shp_matrix_pa %>%
  pseudoabsences(df_var_preditors, spp_names, metodo="random") 

It is possible to plot a t-SNE graph to check whether pseudoabsence data clusters into a separate group from presence data.

df_potential_predictors %>% 
  tsne_plot(df_pseudoabsences, spp_names)
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## ℹ The deprecated feature was likely used in the dplyr package.
##   Please report the issue at <]8;;https://github.com/tidyverse/dplyr/issueshttps://github.com/tidyverse/dplyr/issues]8;;>.
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.

Join data

As we are using the sdm package, let’s start to build our models by indicating our input data.

d <- df_species %>% 
  fit_data(df_var_preditors, df_pseudoabsences)
d
## $clssm_mcrp
## class                                 : sdmdata 
## =========================================================== 
## number of species                     :  1 
## species names                         :  clssm_mcrp 
## number of features                    :  4 
## feature names                         :  dim_1_bio, dim_2_bio, dim_3_bio, ... 
## type                                  :  Presence-Background 
## has independet test data?             :  FALSE 
## number of records                     :  440 
## has Coordinates?                      :  FALSE

Training Models

With the data, we can build our models.

df_species %>%
  train_models_to_folder(
      d, 
      algo, 
      n_run, 
      n_folds, 
      folder_models
    )

folder_models %>%
  dir_ls() %>%
  path_file() %>% 
  head() %>%
  as.data.frame()
##            .
## 1 clssm_mcrp
"Number of trained species: " %>%
  paste0( 
    folder_models %>%
      dir_ls() %>% 
      length()
  ) %>%
  print()
## [1] "Number of trained species: 1"

How many models failed?

d %>% 
  model_failures(folder_models)
## [1] species method  success
## <0 rows> (or 0-length row.names)

Threshold visualization

thresholds_models <- spp_names %>% 
  sp_thresh_from_folder(folder_models, thresh_criteria)

thresholds_models_means <- spp_names %>%  
  sp_thresh_mean_from_folder(folder_models, thresholds_models)

thresholds_models_means  
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method   mean
##   <chr>      <chr>   <dbl>
## 1 clssm_mcrp brt    0.501 
## 2 clssm_mcrp glmnet 0.0186
## 3 clssm_mcrp mlp    0.405

Projections

To project our models in space, we need to set where the models were saved (previously set as the folder_models object) and where we want to save our projections.

directory_projections <- here("output_data")

Set up some variables.

df_pa <- shp_matrix_pa %>% 
  as.data.frame() %>%
  select(-c('geometry'))

df_potential_predictors <- df_pa %>% 
  bind_cols(df_var_preditors)

projection_data <- lapply(grid_future, function(x){ x <- as.data.frame(x)
                                           x[!(names(x) %in% c("x_centroid", "y_centroid", "geometry"))]})
projection_data$current <- df_var_preditors

And finally run our projections.

# Project models in scenarios
df_pa %>% predict_to_folder(
      projection_data,
      folder_models, 
      grid_study_area,
      algo, 
      thresh_criteria, 
      directory_projections
    )

Visualizing Results

Presence/Absence maps to each algorithm to the current scenario

predictions_sp <- spp_names %>% 
  sp_predictions_from_folder(directory_projections)

predictions_sp %>% 
  pluck(2) %>%
  distribution_map_algo(grid_study_area, algo)
## [[1]]

## 
## [[2]]

## 
## [[3]]

Presence/Absence Consensus for current scenario

predictions_sp %>% 
  pluck(2) %>%
  distribution_map(grid_study_area)

Frequence map to each algorithm in current scenario

predictions_sp %>% 
  pluck(1) %>%
  distribution_map_algo(grid_study_area, algo)
## [[1]]

## 
## [[2]]

## 
## [[3]]

Frequence consensus map in current scenario

predictions_sp %>% 
  pluck(1) %>%
  distribution_map(grid_study_area)

Frequence consensus map in future ssp245 scenario

predictions_sp$ssp245_freq %>%
  distribution_map(grid_study_area)

Frequence consensus map in future ssp585 scenario

predictions_sp$ssp585_freq %>%
  distribution_map(grid_study_area)

Ensemble maps for current scenario

predictions_sp$current_freq %>%
  distribution_map_ensembles(grid_study_area)
## [[1]]

## 
## [[2]]

Ensemble maps for ssp585 scenario

predictions_sp$ssp585_freq %>%
  distribution_map_ensembles(grid_study_area)
## [[1]]

## 
## [[2]]